This work is a study of the functions in the MicrosofML Package, and some of the codes are from this blog
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 649715 34.7 1168576 62.5 940480 50.3
## Vcells 1003689 7.7 2060183 15.8 1264642 9.7
###############################################
############Preparing the packages#############
###############################################
# Selectin CRAM Mirror
local({r <- getOption("repos")
r["CRAN"] <- "https://vps.fmvz.usp.br/CRAN/"
options(repos=r)})
Install_And_Load <- function(Required_Packages)
{
Remaining_Packages <- Required_Packages[!(Required_Packages %in% installed.packages()[,"Package"])];
if(length(Remaining_Packages))
{
install.packages(Remaining_Packages, dependencies = TRUE);
}
for(package_name in Required_Packages)
{
library(package_name,character.only=TRUE,quietly=TRUE);
}
}
Required_Packages =c("RODBC","ggplot2","dplyr","stringr",
"AUC","anytime","plotROC","CHAID",
"RPostgreSQL","xlsx","MicrosoftML",
"ggmap","maps","mapdata",
"NbClust","factoextra","data.table");
Install_And_Load(Required_Packages)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
## Warning: package 'anytime' was built under R version 3.4.2
## Warning: package 'partykit' was built under R version 3.4.2
## Warning: package 'RPostgreSQL' was built under R version 3.4.2
## Warning: package 'DBI' was built under R version 3.4.2
## Warning: package 'xlsx' was built under R version 3.4.2
## Warning: package 'rJava' was built under R version 3.4.2
## Warning: package 'xlsxjars' was built under R version 3.4.2
## Google Maps API Terms of Service: http://developers.google.com/maps/terms.
## Please cite ggmap if you use it: see citation("ggmap") for details.
## Warning: package 'maps' was built under R version 3.4.2
## Warning: package 'mapdata' was built under R version 3.4.2
## Warning: package 'factoextra' was built under R version 3.4.2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
options(scipen = 9999,digits = 10)
Base<-fread('file:///D:/Disco C/Documentos/r/Aprendendo MicrosoftML/Amostra.csv',sep=';',dec='.')
##
Read 6.9% of 1731585 rows
Read 15.6% of 1731585 rows
Read 25.4% of 1731585 rows
Read 34.1% of 1731585 rows
Read 43.3% of 1731585 rows
Read 53.1% of 1731585 rows
Read 62.4% of 1731585 rows
Read 69.9% of 1731585 rows
Read 78.5% of 1731585 rows
Read 86.0% of 1731585 rows
Read 95.9% of 1731585 rows
Read 1731585 rows and 21 (of 21) columns from 0.307 GB file in 00:00:14
glimpse(Base)
## Observations: 1,731,585
## Variables: 21
## $ medallion <chr> "1DDB1255470A78637646E29BD7053C5C", "1DDB12...
## $ hack_license <chr> "A0EF5AB4B697801EC119EFA6B60ECF19", "A0EF5A...
## $ vendor_id <chr> "VTS", "VTS", "VTS", "VTS", "VTS", "VTS", "...
## $ rate_code <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ store_and_fwd_flag <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pickup_datetime <chr> "2013-05-05 16:52:00", "2013-05-07 08:43:00...
## $ dropoff_datetime <chr> "2013-05-05 16:59:00", "2013-05-07 08:59:00...
## $ passenger_count <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
## $ trip_time_in_secs <int> 420, 960, 780, 180, 600, 2760, 480, 480, 11...
## $ trip_distance <dbl> 1.22, 1.13, 2.47, 0.43, 2.04, 10.05, 1.27, ...
## $ pickup_longitude <dbl> -73.9683, -73.9634, -73.9888, -73.9530, -73...
## $ pickup_latitude <dbl> 40.7626, 40.7656, 40.7725, 40.7831, 40.8022...
## $ dropoff_longitude <dbl> -73.9805, -73.9762, -73.9775, -73.9560, -73...
## $ dropoff_latitude <dbl> 40.7708, 40.7574, 40.7535, 40.7777, 40.7777...
## $ payment_type <chr> "CRD", "CRD", "CRD", "CRD", "CSH", "CRD", "...
## $ fare_amount <dbl> 7.0, 10.5, 11.5, 4.5, 9.5, 38.5, 7.0, 8.0, ...
## $ surcharge <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
## $ mta_tax <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5...
## $ tip_amount <dbl> 1.40, 0.50, 3.45, 0.90, 0.00, 7.70, 1.40, 1...
## $ tolls_amount <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 5.33, 0.00, 0...
## $ total_amount <dbl> 8.90, 11.50, 15.45, 5.90, 10.00, 52.03, 8.9...
Base$pickup_datetime<-as.POSIXct(Base$pickup_datetime,"%Y-%m-%d %H:%M:%S",tz='America/New_York')
Base$dropoff_datetime<-as.POSIXct(Base$dropoff_datetime,"%Y-%m-%d %H:%M:%S",tz='America/New_York')
head(Base)
## medallion hack_license
## 1: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 2: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 3: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 4: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 5: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 6: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1: VTS 1 NA 2013-05-05 16:52:00
## 2: VTS 1 NA 2013-05-07 08:43:00
## 3: VTS 1 NA 2013-06-07 06:01:00
## 4: VTS 1 NA 2013-06-10 09:23:00
## 5: VTS 1 NA 2013-06-11 13:53:00
## 6: VTS 1 NA 2013-06-12 14:48:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1: 2013-05-05 16:59:00 6 420 1.22
## 2: 2013-05-07 08:59:00 6 960 1.13
## 3: 2013-06-07 06:14:00 6 780 2.47
## 4: 2013-06-10 09:26:00 6 180 0.43
## 5: 2013-06-11 14:03:00 6 600 2.04
## 6: 2013-06-12 15:34:00 6 2760 10.05
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1: -73.9683 40.7626 -73.9805 40.7708
## 2: -73.9634 40.7656 -73.9762 40.7574
## 3: -73.9888 40.7725 -73.9775 40.7535
## 4: -73.9530 40.7831 -73.9560 40.7777
## 5: -73.9683 40.8022 -73.9795 40.7777
## 6: -73.9664 40.7530 -73.8618 40.7684
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1: CRD 7.0 0 0.5 1.40 0.00
## 2: CRD 10.5 0 0.5 0.50 0.00
## 3: CRD 11.5 0 0.5 3.45 0.00
## 4: CRD 4.5 0 0.5 0.90 0.00
## 5: CSH 9.5 0 0.5 0.00 0.00
## 6: CRD 38.5 0 0.5 7.70 5.33
## total_amount
## 1: 8.90
## 2: 11.50
## 3: 15.45
## 4: 5.90
## 5: 10.00
## 6: 52.03
tail(Base)
## medallion hack_license
## 1: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 2: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 3: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 4: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 5: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## 6: 1DDB1255470A78637646E29BD7053C5C A0EF5AB4B697801EC119EFA6B60ECF19
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1: VTS 1 NA 2013-04-02 10:28:00
## 2: VTS 1 NA 2013-04-04 14:38:00
## 3: VTS 1 NA 2013-04-04 15:52:00
## 4: VTS 1 NA 2013-04-06 10:50:00
## 5: VTS 1 NA 2013-04-10 09:22:00
## 6: VTS 1 NA 2013-04-30 11:09:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1: 2013-04-02 10:51:00 6 1380 3.11
## 2: 2013-04-04 14:47:00 6 540 1.54
## 3: 2013-04-04 16:10:00 6 1080 2.16
## 4: 2013-04-06 11:00:00 6 600 2.43
## 5: 2013-04-10 09:38:00 6 960 1.71
## 6: 2013-04-30 11:24:00 6 900 2.00
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1: -74.0019 40.7244 -73.9708 40.7558
## 2: -73.9782 40.7631 -73.9885 40.7791
## 3: -73.9961 40.7261 -73.9778 40.7506
## 4: -73.9986 40.7279 -73.9719 40.7460
## 5: -73.9779 40.7524 -73.9961 40.7466
## 6: -73.9618 40.7675 -73.9813 40.7471
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1: CSH 16.5 0 0.5 0 0
## 2: CSH 8.0 0 0.5 0 0
## 3: CSH 13.0 0 0.5 0 0
## 4: CSH 10.0 0 0.5 0 0
## 5: CSH 11.5 0 0.5 0 0
## 6: CRD 11.0 0 0.5 1 0
## total_amount
## 1: 17.0
## 2: 8.5
## 3: 13.5
## 4: 10.5
## 5: 12.0
## 6: 12.5
glimpse(Base)
## Observations: 1,731,585
## Variables: 21
## $ medallion <chr> "1DDB1255470A78637646E29BD7053C5C", "1DDB12...
## $ hack_license <chr> "A0EF5AB4B697801EC119EFA6B60ECF19", "A0EF5A...
## $ vendor_id <chr> "VTS", "VTS", "VTS", "VTS", "VTS", "VTS", "...
## $ rate_code <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ store_and_fwd_flag <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ pickup_datetime <dttm> 2013-05-05 16:52:00, 2013-05-07 08:43:00, ...
## $ dropoff_datetime <dttm> 2013-05-05 16:59:00, 2013-05-07 08:59:00, ...
## $ passenger_count <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
## $ trip_time_in_secs <int> 420, 960, 780, 180, 600, 2760, 480, 480, 11...
## $ trip_distance <dbl> 1.22, 1.13, 2.47, 0.43, 2.04, 10.05, 1.27, ...
## $ pickup_longitude <dbl> -73.9683, -73.9634, -73.9888, -73.9530, -73...
## $ pickup_latitude <dbl> 40.7626, 40.7656, 40.7725, 40.7831, 40.8022...
## $ dropoff_longitude <dbl> -73.9805, -73.9762, -73.9775, -73.9560, -73...
## $ dropoff_latitude <dbl> 40.7708, 40.7574, 40.7535, 40.7777, 40.7777...
## $ payment_type <chr> "CRD", "CRD", "CRD", "CRD", "CSH", "CRD", "...
## $ fare_amount <dbl> 7.0, 10.5, 11.5, 4.5, 9.5, 38.5, 7.0, 8.0, ...
## $ surcharge <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
## $ mta_tax <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5...
## $ tip_amount <dbl> 1.40, 0.50, 3.45, 0.90, 0.00, 7.70, 1.40, 1...
## $ tolls_amount <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 5.33, 0.00, 0...
## $ total_amount <dbl> 8.90, 11.50, 15.45, 5.90, 10.00, 52.03, 8.9...
summary(Base)
## medallion hack_license vendor_id
## Length:1731585 Length:1731585 Length:1731585
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## rate_code store_and_fwd_flag pickup_datetime
## Min. : 0.000000 Length:1731585 Min. :2013-01-01 00:11:00
## 1st Qu.: 1.000000 Class :character 1st Qu.:2013-03-29 22:43:00
## Median : 1.000000 Mode :character Median :2013-06-24 15:32:00
## Mean : 1.033811 Mean :2013-06-29 13:56:00
## 3rd Qu.: 1.000000 3rd Qu.:2013-09-30 13:27:48
## Max. :210.000000 Max. :2013-12-31 23:58:25
##
## dropoff_datetime passenger_count trip_time_in_secs
## Min. :2013-01-01 00:11:00 Min. :0.000000 Min. : -10.000
## 1st Qu.:2013-03-29 22:57:00 1st Qu.:1.000000 1st Qu.: 360.000
## Median :2013-06-24 15:44:00 Median :1.000000 Median : 600.000
## Mean :2013-06-29 14:08:34 Mean :1.708797 Mean : 800.021
## 3rd Qu.:2013-09-30 13:41:00 3rd Qu.:2.000000 3rd Qu.: 960.000
## Max. :2014-01-01 00:34:00 Max. :9.000000 Max. :4294905.000
##
## trip_distance pickup_longitude pickup_latitude
## Min. : 0.000 Min. :-1517.10000 Min. :-180.00000
## 1st Qu.: 1.040 1st Qu.: -73.99220 1st Qu.: 40.73450
## Median : 1.800 Median : -73.98190 Median : 40.75230
## Mean : 6.018 Mean : -72.44473 Mean : 39.74892
## 3rd Qu.: 3.200 3rd Qu.: -73.96680 3rd Qu.: 40.76710
## Max. :5331800.000 Max. : 40.85680 Max. :3210.36000
##
## dropoff_longitude dropoff_latitude payment_type
## Min. :-740.17300 Min. :-2497.70000 Length:1731585
## 1st Qu.: -73.99150 1st Qu.: 40.73350 Class :character
## Median : -73.98020 Median : 40.75270 Mode :character
## Mean : -72.40124 Mean : 39.72285
## 3rd Qu.: -73.96360 3rd Qu.: 40.76780
## Max. : 40.90470 Max. : 2302.58000
## NA's :20 NA's :20
## fare_amount surcharge mta_tax
## Min. :-79.00000 Min. :-1.0000000 Min. :-0.5000000
## 1st Qu.: 6.50000 1st Qu.: 0.0000000 1st Qu.: 0.5000000
## Median : 9.50000 Median : 0.0000000 Median : 0.5000000
## Mean : 12.35104 Mean : 0.3202457 Mean : 0.4981956
## 3rd Qu.: 14.00000 3rd Qu.: 0.5000000 3rd Qu.: 0.5000000
## Max. :620.01000 Max. :28.0000000 Max. : 2.5000000
##
## tip_amount tolls_amount total_amount
## Min. : 0.000000 Min. : 0.0000000 Min. :-79.00000
## 1st Qu.: 0.000000 1st Qu.: 0.0000000 1st Qu.: 8.00000
## Median : 1.000000 Median : 0.0000000 Median : 11.00000
## Mean : 1.365235 Mean : 0.2504534 Mean : 14.78529
## 3rd Qu.: 2.000000 3rd Qu.: 0.0000000 3rd Qu.: 16.50000
## Max. :182.450000 Max. :45.0000000 Max. :620.01000
##
Base%>%with(length(unique(medallion)))
## [1] 13524
Base%>%with(length(unique(hack_license)))
## [1] 39051
Base%>%group_by(store_and_fwd_flag)%>%summarise(Qnt=n())%>%mutate(`%`=Qnt/sum(Qnt))
## Warning: package 'bindrcpp' was built under R version 3.4.2
## # A tibble: 3 x 3
## store_and_fwd_flag Qnt `%`
## <chr> <int> <dbl>
## 1 N 849077 0.49034670548
## 2 Y 20899 0.01206928912
## 3 <NA> 861609 0.49758400541
Base%>%group_by(payment_type)%>%summarise(Qnt=n())%>%mutate(`%`=Qnt/sum(Qnt))
## # A tibble: 5 x 3
## payment_type Qnt `%`
## <chr> <int> <dbl>
## 1 CRD 931290 0.5378251717357
## 2 CSH 792780 0.4578348738295
## 3 DIS 1302 0.0007519122654
## 4 NOC 4169 0.0024076207636
## 5 UNK 2044 0.0011804214058
Base%>%group_by(Year_Month=format(pickup_datetime,'%Y-%m'))%>%summarise(Mean_trip_Time=mean(trip_time_in_secs),Qnt=n())%>%mutate(`%`=Qnt/sum(Qnt))
## # A tibble: 12 x 4
## Year_Month Mean_trip_Time Qnt `%`
## <chr> <dbl> <int> <dbl>
## 1 2013-01 681.9155664 147311 0.08507292452
## 2 2013-02 701.4072688 141592 0.08177017010
## 3 2013-03 718.7444048 154150 0.08902248518
## 4 2013-04 744.9185972 150732 0.08704857111
## 5 2013-05 777.9325432 156174 0.09019135647
## 6 2013-06 778.4977133 144093 0.08321451156
## 7 2013-07 750.7912189 137249 0.07926206337
## 8 2013-08 1405.1602936 125189 0.07229734607
## 9 2013-09 784.3848638 144277 0.08332077259
## 10 2013-10 780.3568885 150257 0.08677425596
## 11 2013-11 775.5212709 143553 0.08290265855
## 12 2013-12 788.8799048 137008 0.07912288452
Base%>%filter(pickup_longitude==min(pickup_longitude))
## medallion hack_license
## 1 0C4726D4E2AF94BF8FE2D23EFDA20917 42CC3A49C66FA772567BBAB7F85F8835
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 1 <NA> 2013-02-05 10:17:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-02-05 10:59:00 1 2520 11.59
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -1517.1 40.769 -73.9827 40.7623
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 UNK 38.5 0 0.5 8.76 4.8
## total_amount
## 1 52.56
Base%>%filter(pickup_longitude==max(pickup_longitude))
## medallion hack_license
## 1 5DE2E0B9BB72C1407330C14A0886EB85 D4DC28CC815084D8F55B56C4FA379C22
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 1 <NA> 2013-05-26 00:18:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-05-26 00:41:00 5 1380 5.43
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 40.8568 -73.9337 40.8002 -73.9708
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 21 0.5 0.5 0 0
## total_amount
## 1 22
Base%>%filter(pickup_latitude==min(pickup_latitude))
## medallion hack_license
## 1 A485E6CAA482169A2A3837DEC32AEBAB 7B19DE6D4D54999531BEB27F758F71F6
## 2 A485E6CAA482169A2A3837DEC32AEBAB 7B19DE6D4D54999531BEB27F758F71F6
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 5 <NA> 2013-11-27 12:31:00
## 2 VTS 5 <NA> 2013-11-27 13:01:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-11-27 12:31:00 1 0 0
## 2 2013-11-27 13:01:00 1 0 0
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -180 -180 -180 -180
## 2 -180 -180 -180 -180
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CSH 99.99 0 0 0 0
## 2 CSH 333.38 0 0 0 0
## total_amount
## 1 99.99
## 2 333.38
Base%>%filter(pickup_latitude==max(pickup_latitude))
## medallion hack_license
## 1 B602FCD21BD17941F53B6B61500A5DA2 5276DE98232EBA65D215BEBA6C0EBB31
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 1 <NA> 2013-03-18 06:51:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-03-18 06:52:00 4 60 0.43
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 0 3210.36 -0.29 -2497.7
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 3.5 0 0.5 2 0
## total_amount
## 1 6
Base%>%filter(dropoff_latitude==min(dropoff_latitude))
## [1] medallion hack_license vendor_id
## [4] rate_code store_and_fwd_flag pickup_datetime
## [7] dropoff_datetime passenger_count trip_time_in_secs
## [10] trip_distance pickup_longitude pickup_latitude
## [13] dropoff_longitude dropoff_latitude payment_type
## [16] fare_amount surcharge mta_tax
## [19] tip_amount tolls_amount total_amount
## <0 rows> (or 0-length row.names)
Base%>%filter(dropoff_longitude==max(dropoff_longitude))
## [1] medallion hack_license vendor_id
## [4] rate_code store_and_fwd_flag pickup_datetime
## [7] dropoff_datetime passenger_count trip_time_in_secs
## [10] trip_distance pickup_longitude pickup_latitude
## [13] dropoff_longitude dropoff_latitude payment_type
## [16] fare_amount surcharge mta_tax
## [19] tip_amount tolls_amount total_amount
## <0 rows> (or 0-length row.names)
Base%>%filter(dropoff_latitude==min(dropoff_latitude))
## [1] medallion hack_license vendor_id
## [4] rate_code store_and_fwd_flag pickup_datetime
## [7] dropoff_datetime passenger_count trip_time_in_secs
## [10] trip_distance pickup_longitude pickup_latitude
## [13] dropoff_longitude dropoff_latitude payment_type
## [16] fare_amount surcharge mta_tax
## [19] tip_amount tolls_amount total_amount
## <0 rows> (or 0-length row.names)
Base%>%filter(dropoff_latitude==max(dropoff_latitude))
## [1] medallion hack_license vendor_id
## [4] rate_code store_and_fwd_flag pickup_datetime
## [7] dropoff_datetime passenger_count trip_time_in_secs
## [10] trip_distance pickup_longitude pickup_latitude
## [13] dropoff_longitude dropoff_latitude payment_type
## [16] fare_amount surcharge mta_tax
## [19] tip_amount tolls_amount total_amount
## <0 rows> (or 0-length row.names)
Base%>%filter(passenger_count==max(passenger_count))
## medallion hack_license
## 1 780189E2E3A40092B007D2D708791D22 D3471987DBEE5575F367BA8B74D84E8C
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 1 N 2013-09-23 19:50:40
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-09-23 19:56:51 9 371 1.5
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -73.9687 40.7909 -73.9582 40.8105
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 7 1 0.5 1.7 0
## total_amount
## 1 10.2
Base%>%filter(trip_distance==max(trip_distance))
## medallion hack_license
## 1 3DCFC9097488D3F93982ED4A899C6E24 0B21B07994F53C937E2687A81488A3C5
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 1 Y 2013-08-07 15:09:17
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-07 15:17:07 1 470 5331800
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -73.9805 40.7509 -73.9797 40.7444
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 7 0 0.5 1.5 0
## total_amount
## 1 9
Base%>%filter(trip_time_in_secs==max(trip_time_in_secs))
## medallion hack_license
## 1 99115D1EA0AE33939899D652DCC34089 B508465FAC4F54A40CFDBB2B69707F5A
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 1 N 2013-08-04 03:27:21
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-04 09:48:39 1 4294905 16
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -74.0087 40.7115 -74.0106 40.7125
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CSH 44 0 0.5 0 0
## total_amount
## 1 44.5
Base%>%filter(fare_amount==max(fare_amount))
## medallion hack_license
## 1 AC7A85219867AB060609BA214C124969 8FB6EB354A3D5986F098EEC2642D1340
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 5 N 2013-08-21 00:11:07
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-21 00:13:04 1 117 0.4
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -74.0428 40.7201 -74.0358 40.7174
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 DIS 620.01 0 0 0 0
## total_amount
## 1 620.01
Base%>%filter(surcharge==max(surcharge))
## medallion hack_license
## 1 039A16D739D799891C9211A55F731263 4626C9FE17E1AAD39BC797B3F6417C78
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 1 N 2013-08-25 17:18:14
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-25 17:19:17 1 63 0.1
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -73.9933 40.7539 -73.988 40.7517
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 NOC 3 28 0.5 0 0
## total_amount
## 1 31.5
Base%>%filter(tip_amount==max(tip_amount))
## medallion hack_license
## 1 848AC8C699E0BE2A7A04605006116745 C6C54D46C2272CD5C69BD2E6502B5D1C
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 1 <NA> 2013-01-23 21:39:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-01-23 21:46:00 1 420 1.43
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -73.995 40.74 -73.9895 40.7556
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 7 0.5 0.5 182.45 0
## total_amount
## 1 190.45
Base%>%filter(tolls_amount==max(tolls_amount))
## medallion hack_license
## 1 77D0801A31D1326C3D71CB8E5611D30C AE103F4D67534DCF2A19B6EC0E8CAEBD
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 VTS 1 <NA> 2013-08-26 14:07:00
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-26 14:20:00 3 780 1.13
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -73.8874 40.7511 -73.8944 40.7578
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 CRD 9.5 0 0.5 0 45
## total_amount
## 1 55
Base%>%filter(total_amount==max(total_amount))
## medallion hack_license
## 1 AC7A85219867AB060609BA214C124969 8FB6EB354A3D5986F098EEC2642D1340
## vendor_id rate_code store_and_fwd_flag pickup_datetime
## 1 CMT 5 N 2013-08-21 00:11:07
## dropoff_datetime passenger_count trip_time_in_secs trip_distance
## 1 2013-08-21 00:13:04 1 117 0.4
## pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
## 1 -74.0428 40.7201 -74.0358 40.7174
## payment_type fare_amount surcharge mta_tax tip_amount tolls_amount
## 1 DIS 620.01 0 0 0 0
## total_amount
## 1 620.01
#########################
#NYC extreme coordinates#
#North:40.917577#########
#South:40.477399#########
#East: -73.700272########
#West:-74.259090#########
#########################
Base<-Base%>%filter(!((pickup_latitude<0|dropoff_latitude<0)|(pickup_longitude>0|dropoff_longitude>0)))
Base<-Base%>%filter((pickup_longitude>-74.259090 & pickup_longitude< -73.700272) &
(dropoff_longitude>-74.259090 & dropoff_longitude< -73.700272) &
(dropoff_latitude>40.477399 & dropoff_latitude< 40.917577) &
(pickup_latitude>40.477399 & pickup_latitude< 40.917577))
NYC_Map<-get_map(c(-74.259090,40.477399,-73.700272,40.917577),zoom = 10,maptype = 'hybrid')
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=40.697488,-73.979681&zoom=10&size=640x640&scale=2&maptype=hybrid&language=en-EN
############
###Pickup###
############
NYC_Map%>%
ggmap()+
geom_point(data=Base,aes(x=pickup_longitude,y=pickup_latitude),color='blue',size=.5)
#############
###Dropoff###
#############
NYC_Map%>%
ggmap()+
geom_point(data=Base,aes(x=dropoff_longitude,y=dropoff_latitude),color='green',size=.5)
I identified more than 90 differents cluster, by the pickup and dropoff coordinates, but it didn’t bring any practical improvment in the analysis, so i’m going to supress this part.
Base<-Base%>%filter(trip_time_in_secs>0)
Base<-Base%>%mutate(Flag_Tip=0)
Base$Flag_Tip[Base$tip_amount>0]<-1
Base$Flag_Tip_Label<-'Not Tipped'
Base$Flag_Tip_Label[Base$Flag_Tip==1]<-'Tipped'
Base$store_and_fwd_flag_Label<-'Maybe'
Base$store_and_fwd_flag_Label[Base$store_and_fwd_flag=='Y']<-'Yes'
Base$store_and_fwd_flag_Label[Base$store_and_fwd_flag=='N']<-'No'
Base$store_and_fwd_flag_Label<-factor(Base$store_and_fwd_flag_Label,levels = c('Yes','Maybe','No'))
Base<-Base%>%mutate(trip_distance_01=(trip_distance-min(trip_distance))/(max(trip_distance)-min(trip_distance)),
trip_time_in_secs_01=(trip_time_in_secs-min(trip_time_in_secs))/(max(trip_time_in_secs)-min(trip_time_in_secs)))
Base%>%select(Flag_Tip_Label,store_and_fwd_flag_Label)%>%
with(table(Flag_Tip_Label,store_and_fwd_flag_Label))%>%
prop.table(1)%>%addmargins(2)
## store_and_fwd_flag_Label
## Flag_Tip_Label Yes Maybe No Sum
## Not Tipped 0.01238479798 0.49399643141 0.49361877061 1.00000000000
## Tipped 0.01178600433 0.49868029237 0.48953370330 1.00000000000
Base%>%select(Flag_Tip_Label,passenger_count)%>%
with(table(Flag_Tip_Label,passenger_count))%>%
prop.table(1)%>%addmargins(2)
## passenger_count
## Flag_Tip_Label 0 1 2
## Not Tipped 0.000000000000000 0.686359510799242 0.144655232830983
## Tipped 0.000004511820971 0.719942248691572 0.128773010286952
## passenger_count
## Flag_Tip_Label 3 4 5
## Not Tipped 0.047738802357099 0.025423382528297 0.056953726026278
## Tipped 0.037356749684173 0.016233531853456 0.058106614329543
## passenger_count
## Flag_Tip_Label 6 8 9
## Not Tipped 0.038869345458103 0.000000000000000 0.000000000000000
## Tipped 0.039579949467605 0.000002255910485 0.000001127955243
## passenger_count
## Flag_Tip_Label Sum
## Not Tipped 1.000000000000000
## Tipped 1.000000000000000
Base%>%select(Flag_Tip_Label,mta_tax)%>%
with(table(Flag_Tip_Label,mta_tax))%>%
prop.table(1)%>%addmargins(2)
## mta_tax
## Flag_Tip_Label -0.5 0 0.5
## Not Tipped 0.00004457635744 0.00232416174779 0.99763126189477
## Tipped 0.00000000000000 0.00332634001083 0.99667365998917
## mta_tax
## Flag_Tip_Label Sum
## Not Tipped 1.00000000000000
## Tipped 1.00000000000000
Base%>%group_by(store_and_fwd_flag_Label)%>%summarise(Qnt=n(),Tip_Ratio=mean(Flag_Tip))%>%mutate(Freq=Qnt/sum(Qnt))
## # A tibble: 3 x 4
## store_and_fwd_flag_Label Qnt Tip_Ratio Freq
## <fctr> <int> <dbl> <dbl>
## 1 Yes 20451 0.5109285610 0.01207144767
## 2 Maybe 841063 0.5256562231 0.49644750830
## 3 No 832649 0.5212292334 0.49148104403
Base%>%group_by(payment_type)%>%summarise(Qnt=n(),Tip_Ratio=mean(Flag_Tip))%>%mutate(Freq=Qnt/sum(Qnt))
## # A tibble: 5 x 4
## payment_type Qnt Tip_Ratio Freq
## <chr> <int> <dbl> <dbl>
## 1 CRD 911995 0.96992308071864 0.5383159707773
## 2 CSH 774891 0.00009033528587 0.4573886928235
## 3 DIS 1279 0.00781860828772 0.0007549450673
## 4 NOC 4057 0.00295785062854 0.0023946928365
## 5 UNK 1941 0.98042246264812 0.0011456984954
Base%>%group_by(Flag_Tip_Label)%>%summarise(Passenger_Ratio=mean(passenger_count))
## # A tibble: 2 x 2
## Flag_Tip_Label Passenger_Ratio
## <chr> <dbl>
## 1 Not Tipped 1.738564617
## 2 Tipped 1.682533613
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinDistance=min(trip_distance),
Q1distance=quantile(trip_distance,.25),
Q2distance=quantile(trip_distance,.5),
meanDistance=mean(trip_distance),
Q3distance=quantile(trip_distance,.75),
Maxdistance=max(trip_distance),
VarDistance=var(trip_distance),
DPDistance=sd(trip_distance),
CVDistance=sd(trip_distance)/mean(trip_distance))
## # A tibble: 2 x 10
## Flag_Tip_Label MinDistance Q1distance Q2distance meanDistance Q3distance
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped 0 0.95 1.60 2.677764025 2.9
## 2 Tipped 0 1.19 1.99 9.190742578 3.5
## # ... with 4 more variables: Maxdistance <dbl>, VarDistance <dbl>,
## # DPDistance <dbl>, CVDistance <dbl>
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinTime=min(trip_time_in_secs),
Q1Time=quantile(trip_time_in_secs,.25),
Q2Time=quantile(trip_time_in_secs,.5),
meanTime=mean(trip_time_in_secs),
Q3Time=quantile(trip_time_in_secs,.75),
MaxTime=max(trip_time_in_secs),
VarTime=var(trip_time_in_secs),
DPTime=sd(trip_time_in_secs),
CVTime=sd(trip_time_in_secs)/mean(trip_time_in_secs))
## # A tibble: 2 x 10
## Flag_Tip_Label MinTime Q1Time Q2Time meanTime Q3Time MaxTime
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped 1 347 559 778.8271416 900 4294905
## 2 Tipped 1 420 660 819.1549529 1020 4293588
## # ... with 3 more variables: VarTime <dbl>, DPTime <dbl>, CVTime <dbl>
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinFare=min(fare_amount),
Q1Fare=quantile(fare_amount,.25),
Q2Fare=quantile(fare_amount,.5),
meanFare=mean(fare_amount),
Q3Fare=quantile(fare_amount,.75),
MaxFare=max(fare_amount),
VarFare=var(fare_amount),
DPFare=sd(fare_amount),
CVFare=sd(fare_amount)/mean(fare_amount))
## # A tibble: 2 x 10
## Flag_Tip_Label MinFare Q1Fare Q2Fare meanFare Q3Fare MaxFare
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped -52 6 8.5 11.48785553 13 620.01
## 2 Tipped 0 7 10.0 12.98928961 15 400.00
## # ... with 3 more variables: VarFare <dbl>, DPFare <dbl>, CVFare <dbl>
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinSurcharge=min(surcharge),
Q1Surcharge=quantile(surcharge,.25),
Q2Surcharge=quantile(surcharge,.5),
meanSurcharge=mean(surcharge),
Q3Surcharge=quantile(surcharge,.75),
MaxSurcharge=max(surcharge),
VarSurcharge=var(surcharge),
DPSurcharge=sd(surcharge),
CVSurcharge=sd(surcharge)/mean(surcharge))
## # A tibble: 2 x 10
## Flag_Tip_Label MinSurcharge Q1Surcharge Q2Surcharge meanSurcharge
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped -1 0 0.0 0.3132306344
## 2 Tipped 0 0 0.5 0.3274646950
## # ... with 5 more variables: Q3Surcharge <dbl>, MaxSurcharge <dbl>,
## # VarSurcharge <dbl>, DPSurcharge <dbl>, CVSurcharge <dbl>
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinToll=min(tolls_amount),
Q1Toll=quantile(tolls_amount,.25),
Q2Toll=quantile(tolls_amount,.5),
meanToll=mean(tolls_amount),
Q3Toll=quantile(tolls_amount,.75),
MaxToll=max(tolls_amount),
VarToll=var(tolls_amount),
DPToll=sd(tolls_amount),
CVToll=sd(tolls_amount)/mean(tolls_amount))
## # A tibble: 2 x 10
## Flag_Tip_Label MinToll Q1Toll Q2Toll meanToll Q3Toll MaxToll
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped 0 0 0 0.1894105148 0 45.00
## 2 Tipped 0 0 0 0.3027054006 0 24.31
## # ... with 3 more variables: VarToll <dbl>, DPToll <dbl>, CVToll <dbl>
Base%>%group_by(Flag_Tip_Label)%>%
summarise(MinTotal=min(total_amount),
Q1Total=quantile(total_amount,.25),
Q2Total=quantile(total_amount,.5),
meanTotal=mean(total_amount),
Q3Total=quantile(total_amount,.75),
MaxTotal=max(total_amount),
VarTotal=var(total_amount),
DPTotal=sd(total_amount),
CVTotal=sd(total_amount)/mean(total_amount))
## # A tibble: 2 x 10
## Flag_Tip_Label MinTotal Q1Total Q2Total meanTotal Q3Total MaxTotal
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Not Tipped -52.50 7.0 9.5 12.48940600 14.0 620.01
## 2 Tipped 3.01 9.5 12.6 16.70995896 18.6 480.00
## # ... with 3 more variables: VarTotal <dbl>, DPTotal <dbl>, CVTotal <dbl>
Base%>%ggplot(aes(x=store_and_fwd_flag_Label))+
geom_bar(aes(y=100*(..count../sum(..count..))))+
facet_grid(.~Flag_Tip_Label)+
xlab('store and fwd')+
ylab('%')
Base%>%ggplot(aes(x=payment_type))+
geom_bar(aes(y=100*(..count../sum(..count..))))+
facet_grid(.~Flag_Tip_Label)+
xlab('Payment type')+
ylab('%')
Base%>%ggplot(aes(x=passenger_count))+
geom_bar(aes(y=100*(..count../sum(..count..))))+
scale_x_continuous(breaks=c(0,1,2,3,4,5,6,7,8,9))+
facet_grid(.~Flag_Tip_Label)+
xlab('Passenger Count')+
ylab('%')
Base%>%ggplot(aes(x=mta_tax))+
geom_bar(aes(y=100*(..count../sum(..count..))))+
scale_x_continuous(breaks=c(-0.5,0,0.5))+
facet_grid(.~Flag_Tip_Label)+
xlab('Mta Tax')+
ylab('%')
Base%>%ggplot(aes(x=Flag_Tip_Label,y=trip_distance_01))+
geom_boxplot()+
xlab('Tipped')+
ylab('Trip distance transformed to [0,1]')+
scale_y_continuous(limits = c(0,.0000025))
## Warning: Removed 43452 rows containing non-finite values (stat_boxplot).
Base%>%ggplot(aes(x=trip_distance_01, fill=Flag_Tip_Label))+
geom_density(alpha=.2)+
scale_x_continuous(limits = c(0,.0000025))+
xlab('Trip distance transformed to [0,1]')+
ylab('Density')
## Warning: Removed 43452 rows containing non-finite values (stat_density).
Base%>%ggplot(aes(x=Flag_Tip_Label,y=trip_time_in_secs_01))+
geom_boxplot()+
xlab('Tipped')+
ylab('Trip time in secs transformed to [0,1]')+
scale_y_continuous(limits = c(0,.00045))
## Warning: Removed 65917 rows containing non-finite values (stat_boxplot).
Base%>%ggplot(aes(x=trip_time_in_secs_01,fill=Flag_Tip_Label))+
geom_density(alpha=.2)+
ylab('Density')+
xlab('Trip time in secs transformed to [0,1]')+
scale_x_continuous(limits = c(0,.00045))
## Warning: Removed 65917 rows containing non-finite values (stat_density).
Base%>%ggplot(aes(x=Flag_Tip_Label,y=fare_amount))+
geom_boxplot()+
xlab('Tipped')+
ylab('Fare Amount')+
scale_y_continuous(limits = c(0,100))
## Warning: Removed 305 rows containing non-finite values (stat_boxplot).
Base%>%ggplot(aes(x=fare_amount,fill=Flag_Tip_Label))+
geom_density(alpha=.2)+
ylab('Density')+
xlab('Fare Amount')+
scale_x_continuous(limits = c(0,100))
## Warning: Removed 305 rows containing non-finite values (stat_density).
Base%>%ggplot(aes(x=Flag_Tip_Label,y=surcharge))+
geom_boxplot()+
xlab('Tipped')+
ylab('Surcharge')+
scale_y_continuous(limits = c(0,1.5))
## Warning: Removed 31 rows containing non-finite values (stat_boxplot).
Base%>%ggplot(aes(x=surcharge,fill=Flag_Tip_Label))+
geom_density(alpha=.2)+
ylab('Density')+
xlab('Surcharge')+
scale_x_continuous(limits = c(0,1.5))
## Warning: Removed 31 rows containing non-finite values (stat_density).
Base%>%ggplot(aes(y=tolls_amount,x=Flag_Tip_Label))+
geom_point()+
ylab('Tolls Amount')+
xlab('Tipped')
Base%>%ggplot(aes(y=total_amount,x=Flag_Tip_Label))+
geom_point()+
ylab('Total Amount')+
xlab('Tipped')
This part of the study ends up to be a kind of variable selection, a.k.a feature selection, not an automated way, but a very interesting way to do it.
It was possible to see how the Tipped variabel, Flag_Tip_Label interacts with all the other variabels, except the ids variabels medallion, hack_license and vendor_id.
The original post states a formula, explaining the Tipped variabel through passenger_count, trip_time_in_secs, trip_distance and total_amount variabels.
I thick that the payment_tipe variabel also should to be in the model, maybe I wuold compare this new sugestion with the first one.
From here, the code is the same one founded in the original post
set.seed(2345, "L'Ecuyer-CMRG")
# Randomly split the data 75-25 between train and test sets.
dataProb <- c(Train = 0.75, Test = 0.25)
dataSplit <-
rxSplit(Base,
splitByFactor = "splitVar",
transforms = list(splitVar =
sample(dataFactor,
size = .rxNumRows,
replace = TRUE,
prob = dataProb)),
transformObjects =
list(dataProb = dataProb,
dataFactor = factor(names(dataProb),
levels = names(dataProb))),
outFilesBase = tempfile())
## Rows Read: 1694163, Total Rows Processed: 1694163, Total Chunk Time: 3.730 seconds
## Rows Read: 1694163, Total Rows Processed: 1694163Rows Read: 1269673, Total Rows Processed: 1269673, Total Chunk Time: 2.653 seconds
## Rows Read: 424490, Total Rows Processed: 424490, Total Chunk Time: 0.933 seconds
## , Total Chunk Time: 15.984 seconds
# Name the train and test datasets.
dataTrain <- dataSplit[[1]]
dataTest <- dataSplit[[2]]
rxSummary(~ Flag_Tip, dataTrain)$sDataFrame
## Rows Read: 1269673, Total Rows Processed: 1269673, Total Chunk Time: 0.063 seconds
## Computation time: 0.353 seconds.
## Name Mean StdDev Min Max ValidObs MissingObs
## 1 Flag_Tip 0.5235631537 0.4994446659 0 1 1269673 0
rxSummary(~ Flag_Tip, dataTest)$sDataFrame
## Rows Read: 424490, Total Rows Processed: 424490, Total Chunk Time: 0.008 seconds
## Computation time: 0.010 seconds.
## Name Mean StdDev Min Max ValidObs MissingObs
## 1 Flag_Tip 0.5225234988 0.4994930227 0 1 424490 0
model <- formula(paste("Flag_Tip ~ passenger_count + trip_time_in_secs + trip_distance + total_amount"))
rxLogisticRegressionFit <- rxLogisticRegression(model, data = dataTrain)
## Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.105, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Automatically converting column 'Flag_Tip' into a factor.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.117, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.102, Transform Time: 0
## Beginning read for block: 2
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Warning: The number of threads specified in trainer arguments is larger than the concurrency factor setting of the environment. Using 2 training threads instead.
## LBFGS multi-threading will attempt to load dataset into memory. In case of out-of-memory issues, turn off multi-threading by setting trainThreads to 1.
## Beginning optimization
## num vars: 5
## improvement criterion: Mean Improvement
## L1 regularization selected 4 of 5 weights.
## Not training a calibrator because it is not needed.
## Elapsed time: 00:00:12.3652212
## Elapsed time: 00:00:00.1948485
rxFastLinearFit <- rxFastLinear(model, data = dataTrain)
## Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.1, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Automatically converting column 'Flag_Tip' into a factor.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.098, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.098, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Using 2 threads to train.
## Automatically choosing a check frequency of 2.
## Auto-tuning parameters: maxIterations = 2.
## Auto-tuning parameters: L2 = 3,938022E-06.
## Auto-tuning parameters: L1Threshold (L1/L2) = 0.
## Using model from last iteration.
## Not training a calibrator because it is not needed.
## Elapsed time: 00:00:02.1652462
## Elapsed time: 00:00:00.0206831
rxFastTreesFit <- rxFastTrees(model, data = dataTrain)
## Warning: The number of threads specified in trainer arguments is larger than the concurrency factor setting of the environment. Using 2 training threads instead.
## Not adding a normalizer.
## Automatically converting column 'Flag_Tip' into a factor.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.1, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Making per-feature arrays
## Changing data from row-wise to column-wise
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Processed 1269673 instances
## Binning and forming Feature objects
## Reserved memory for tree learner: 121680 bytes
## Starting to train ...
## Not training a calibrator because it is not needed.
## Elapsed time: 00:00:11.2225005
rxFastForestFit <- rxFastForest(model, data = dataTrain)
## Warning: The number of threads specified in trainer arguments is larger than the concurrency factor setting of the environment. Using 2 training threads instead.
## Not adding a normalizer.
## Automatically converting column 'Flag_Tip' into a factor.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.1, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Making per-feature arrays
## Changing data from row-wise to column-wise
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Processed 1269673 instances
## Binning and forming Feature objects
## Reserved memory for tree learner: 121680 bytes
## Starting to train ...
## Warning: 2 of the boosting iterations failed to grow a tree. This is commonly because the minimum documents in leaf hyperparameter was set too high for this dataset.
## Training calibrator.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.1, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:14.5551461
rxNeuralNetFit <- rxNeuralNet(model, data = dataTrain)
## Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.098, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Automatically converting column 'Flag_Tip' into a factor.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.098, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## Beginning read for block: 1
## Rows Read: 1269673, Read Time: 0.099, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Using: SSE Math
##
## ***** Net definition *****
## input Data [4];
## hidden H [100] sigmoid { // Depth 1
## from Data all;
## }
## output Result [1] sigmoid { // Depth 0
## from H all;
## }
## ***** End net definition *****
## Input count: 4
## Output count: 1
## Output Function: Sigmoid
## Loss Function: LogLoss
## PreTrainer: NoPreTrainer
## ___________________________________________________________________
## Starting training...
## Learning rate: 0,001000
## Momentum: 0,000000
## InitWtsDiameter: 0,100000
## ___________________________________________________________________
## Initializing 1 Hidden Layers, 601 Weights...
## Estimated Pre-training MeanError = 0,696093
## Iter:1/100, MeanErr=0,693611(-0,36%%), 390,81M WeightUpdates/sec
## Iter:2/100, MeanErr=0,693291(-0,05%%), 387,40M WeightUpdates/sec
## Iter:3/100, MeanErr=0,692855(-0,06%%), 375,15M WeightUpdates/sec
## Iter:4/100, MeanErr=0,691979(-0,13%%), 387,75M WeightUpdates/sec
## Iter:5/100, MeanErr=0,689888(-0,30%%), 385,49M WeightUpdates/sec
## Iter:6/100, MeanErr=0,685817(-0,59%%), 383,79M WeightUpdates/sec
## Iter:7/100, MeanErr=0,680775(-0,74%%), 382,70M WeightUpdates/sec
## Iter:8/100, MeanErr=0,677331(-0,51%%), 385,03M WeightUpdates/sec
## Iter:9/100, MeanErr=0,675852(-0,22%%), 382,97M WeightUpdates/sec
## Iter:10/100, MeanErr=0,675440(-0,06%%), 385,69M WeightUpdates/sec
## Iter:11/100, MeanErr=0,675325(-0,02%%), 390,11M WeightUpdates/sec
## Iter:12/100, MeanErr=0,675226(-0,01%%), 385,74M WeightUpdates/sec
## Iter:13/100, MeanErr=0,675272(0,01%%), 386,22M WeightUpdates/sec
## Iter:14/100, MeanErr=0,675198(-0,01%%), 385,46M WeightUpdates/sec
## Iter:15/100, MeanErr=0,675134(-0,01%%), 389,66M WeightUpdates/sec
## Iter:16/100, MeanErr=0,675098(-0,01%%), 372,16M WeightUpdates/sec
## Iter:17/100, MeanErr=0,675066(0,00%%), 384,64M WeightUpdates/sec
## Iter:18/100, MeanErr=0,675049(0,00%%), 376,91M WeightUpdates/sec
## Iter:19/100, MeanErr=0,675026(0,00%%), 386,71M WeightUpdates/sec
## Iter:20/100, MeanErr=0,674977(-0,01%%), 384,51M WeightUpdates/sec
## Iter:21/100, MeanErr=0,674924(-0,01%%), 382,25M WeightUpdates/sec
## Iter:22/100, MeanErr=0,674912(0,00%%), 385,25M WeightUpdates/sec
## Iter:23/100, MeanErr=0,674844(-0,01%%), 389,40M WeightUpdates/sec
## Iter:24/100, MeanErr=0,674823(0,00%%), 382,62M WeightUpdates/sec
## Iter:25/100, MeanErr=0,674803(0,00%%), 385,59M WeightUpdates/sec
## Iter:26/100, MeanErr=0,674746(-0,01%%), 390,01M WeightUpdates/sec
## Iter:27/100, MeanErr=0,674730(0,00%%), 385,58M WeightUpdates/sec
## Iter:28/100, MeanErr=0,674655(-0,01%%), 383,01M WeightUpdates/sec
## Iter:29/100, MeanErr=0,674642(0,00%%), 385,75M WeightUpdates/sec
## Iter:30/100, MeanErr=0,674607(-0,01%%), 382,99M WeightUpdates/sec
## Iter:31/100, MeanErr=0,674538(-0,01%%), 387,29M WeightUpdates/sec
## Iter:32/100, MeanErr=0,674536(0,00%%), 378,48M WeightUpdates/sec
## Iter:33/100, MeanErr=0,674481(-0,01%%), 376,62M WeightUpdates/sec
## Iter:34/100, MeanErr=0,674403(-0,01%%), 386,01M WeightUpdates/sec
## Iter:35/100, MeanErr=0,674381(0,00%%), 383,69M WeightUpdates/sec
## Iter:36/100, MeanErr=0,674301(-0,01%%), 386,85M WeightUpdates/sec
## Iter:37/100, MeanErr=0,674228(-0,01%%), 382,40M WeightUpdates/sec
## Iter:38/100, MeanErr=0,674160(-0,01%%), 383,37M WeightUpdates/sec
## Iter:39/100, MeanErr=0,674069(-0,01%%), 388,91M WeightUpdates/sec
## Iter:40/100, MeanErr=0,673979(-0,01%%), 383,82M WeightUpdates/sec
## Iter:41/100, MeanErr=0,673848(-0,02%%), 383,83M WeightUpdates/sec
## Iter:42/100, MeanErr=0,673775(-0,01%%), 383,35M WeightUpdates/sec
## Iter:43/100, MeanErr=0,673637(-0,02%%), 390,07M WeightUpdates/sec
## Iter:44/100, MeanErr=0,673551(-0,01%%), 392,98M WeightUpdates/sec
## Iter:45/100, MeanErr=0,673453(-0,01%%), 386,19M WeightUpdates/sec
## Iter:46/100, MeanErr=0,673382(-0,01%%), 388,55M WeightUpdates/sec
## Iter:47/100, MeanErr=0,673295(-0,01%%), 382,80M WeightUpdates/sec
## Iter:48/100, MeanErr=0,673224(-0,01%%), 375,06M WeightUpdates/sec
## Iter:49/100, MeanErr=0,673198(0,00%%), 390,59M WeightUpdates/sec
## Iter:50/100, MeanErr=0,673144(-0,01%%), 391,19M WeightUpdates/sec
## Iter:51/100, MeanErr=0,673115(0,00%%), 386,64M WeightUpdates/sec
## Iter:52/100, MeanErr=0,673088(0,00%%), 384,55M WeightUpdates/sec
## Iter:53/100, MeanErr=0,673085(0,00%%), 392,09M WeightUpdates/sec
## Iter:54/100, MeanErr=0,673028(-0,01%%), 387,47M WeightUpdates/sec
## Iter:55/100, MeanErr=0,673027(0,00%%), 384,23M WeightUpdates/sec
## Iter:56/100, MeanErr=0,672973(-0,01%%), 382,95M WeightUpdates/sec
## Iter:57/100, MeanErr=0,672930(-0,01%%), 389,52M WeightUpdates/sec
## Iter:58/100, MeanErr=0,672963(0,00%%), 383,34M WeightUpdates/sec
## Iter:59/100, MeanErr=0,672922(-0,01%%), 388,93M WeightUpdates/sec
## Iter:60/100, MeanErr=0,672896(0,00%%), 386,66M WeightUpdates/sec
## Iter:61/100, MeanErr=0,672868(0,00%%), 387,44M WeightUpdates/sec
## Iter:62/100, MeanErr=0,672845(0,00%%), 385,33M WeightUpdates/sec
## Iter:63/100, MeanErr=0,672800(-0,01%%), 372,28M WeightUpdates/sec
## Iter:64/100, MeanErr=0,672818(0,00%%), 386,99M WeightUpdates/sec
## Iter:65/100, MeanErr=0,672729(-0,01%%), 388,92M WeightUpdates/sec
## Iter:66/100, MeanErr=0,672742(0,00%%), 389,05M WeightUpdates/sec
## Iter:67/100, MeanErr=0,672702(-0,01%%), 388,50M WeightUpdates/sec
## Iter:68/100, MeanErr=0,672702(0,00%%), 387,75M WeightUpdates/sec
## Iter:69/100, MeanErr=0,672631(-0,01%%), 382,91M WeightUpdates/sec
## Iter:70/100, MeanErr=0,672621(0,00%%), 383,43M WeightUpdates/sec
## Iter:71/100, MeanErr=0,672587(-0,01%%), 386,86M WeightUpdates/sec
## Iter:72/100, MeanErr=0,672569(0,00%%), 384,59M WeightUpdates/sec
## Iter:73/100, MeanErr=0,672560(0,00%%), 384,89M WeightUpdates/sec
## Iter:74/100, MeanErr=0,672498(-0,01%%), 384,37M WeightUpdates/sec
## Iter:75/100, MeanErr=0,672502(0,00%%), 386,16M WeightUpdates/sec
## Iter:76/100, MeanErr=0,672468(-0,01%%), 387,39M WeightUpdates/sec
## Iter:77/100, MeanErr=0,672435(0,00%%), 383,83M WeightUpdates/sec
## Iter:78/100, MeanErr=0,672392(-0,01%%), 374,49M WeightUpdates/sec
## Iter:79/100, MeanErr=0,672355(-0,01%%), 379,71M WeightUpdates/sec
## Iter:80/100, MeanErr=0,672348(0,00%%), 391,53M WeightUpdates/sec
## Iter:81/100, MeanErr=0,672339(0,00%%), 382,67M WeightUpdates/sec
## Iter:82/100, MeanErr=0,672331(0,00%%), 388,71M WeightUpdates/sec
## Iter:83/100, MeanErr=0,672240(-0,01%%), 391,03M WeightUpdates/sec
## Iter:84/100, MeanErr=0,672257(0,00%%), 387,62M WeightUpdates/sec
## Iter:85/100, MeanErr=0,672213(-0,01%%), 390,87M WeightUpdates/sec
## Iter:86/100, MeanErr=0,672157(-0,01%%), 384,19M WeightUpdates/sec
## Iter:87/100, MeanErr=0,672127(0,00%%), 383,61M WeightUpdates/sec
## Iter:88/100, MeanErr=0,672117(0,00%%), 384,70M WeightUpdates/sec
## Iter:89/100, MeanErr=0,672103(0,00%%), 383,15M WeightUpdates/sec
## Iter:90/100, MeanErr=0,672072(0,00%%), 390,25M WeightUpdates/sec
## Iter:91/100, MeanErr=0,672025(-0,01%%), 392,23M WeightUpdates/sec
## Iter:92/100, MeanErr=0,671979(-0,01%%), 387,49M WeightUpdates/sec
## Iter:93/100, MeanErr=0,671955(0,00%%), 373,03M WeightUpdates/sec
## Iter:94/100, MeanErr=0,671944(0,00%%), 383,79M WeightUpdates/sec
## Iter:95/100, MeanErr=0,671884(-0,01%%), 383,42M WeightUpdates/sec
## Iter:96/100, MeanErr=0,671858(0,00%%), 381,62M WeightUpdates/sec
## Iter:97/100, MeanErr=0,671820(-0,01%%), 388,47M WeightUpdates/sec
## Iter:98/100, MeanErr=0,671772(-0,01%%), 378,51M WeightUpdates/sec
## Iter:99/100, MeanErr=0,671738(-0,01%%), 384,15M WeightUpdates/sec
## Iter:100/100, MeanErr=0,671691(-0,01%%), 392,35M WeightUpdates/sec
## Done!
## Estimated Post-training MeanError = 0,671329
## ___________________________________________________________________
## Not training a calibrator because it is not needed.
## Elapsed time: 00:03:19.9055998
fitScores <- rxPredict(rxLogisticRegressionFit, dataTest, suffix = ".rxLogisticRegression",
extraVarsToWrite = names(dataTest),
outData = tempfile(fileext = ".xdf"))
## Beginning read for block: 1
## Rows Read: 424490, Read Time: 0.615, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:18.5327430
## Finished writing 424490 rows.
## Writing completed.
fitScores <- rxPredict(rxFastLinearFit, fitScores, suffix = ".rxFastLinear",
extraVarsToWrite = names(fitScores),
outData = tempfile(fileext = ".xdf"))
## Beginning read for block: 1
## Rows Read: 424490, Read Time: 0.62, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:18.4044550
## Finished writing 424490 rows.
## Writing completed.
fitScores <- rxPredict(rxFastTreesFit, fitScores, suffix = ".rxFastTrees",
extraVarsToWrite = names(fitScores),
outData = tempfile(fileext = ".xdf"))
## Beginning read for block: 1
## Rows Read: 424490, Read Time: 0.625, Transform Time: 0.001
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:20.5500508
## Finished writing 424490 rows.
## Writing completed.
fitScores <- rxPredict(rxFastForestFit, fitScores, suffix = ".rxFastForest",
extraVarsToWrite = names(fitScores),
outData = tempfile(fileext = ".xdf"))
## Beginning read for block: 1
## Rows Read: 424490, Read Time: 0.633, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:21.7202885
## Finished writing 424490 rows.
## Writing completed.
fitScores <- rxPredict(rxNeuralNetFit, fitScores, suffix = ".rxNeuralNet",
extraVarsToWrite = names(fitScores),
outData = tempfile(fileext = ".xdf"))
## Beginning read for block: 1
## Rows Read: 424490, Read Time: 0.637, Transform Time: 0
## Beginning read for block: 2
## No rows remaining. Finished reading data set.
## Elapsed time: 00:00:18.8436828
## Finished writing 424490 rows.
## Writing completed.
# Compute the fit models's ROC curves.
fitRoc <- rxRoc("Flag_Tip", grep("Probability.", names(fitScores), value = T), fitScores)
# Plot the ROC curves and report their AUCs.
plot(fitRoc)
# Create a named list of the fit models.
fitList <-
list(rxLogisticRegression = rxLogisticRegressionFit,
rxFastLinear = rxFastLinearFit,
rxFastTrees = rxFastTreesFit,
rxFastForest = rxFastForestFit,
rxNeuralNet = rxNeuralNetFit)
# Compute the fit models's AUCs.
fitAuc <- rxAuc(fitRoc)
names(fitAuc) <- substring(names(fitAuc), nchar("Probability.") + 1)
# Find the name of the fit with the largest AUC.
bestFitName <- names(which.max(fitAuc))
# Select the fit model with the largest AUC.
bestFit <- fitList[[bestFitName]]
# Report the fit AUCs.
cat("Fit model AUCs:\n")
## Fit model AUCs:
print(fitAuc, digits = 2)
## rxFastForest.1 rxFastLinear.1 rxFastTrees.1
## 0.87 0.66 0.98
## rxLogisticRegression.1 rxNeuralNet.1
## 0.50 0.65
# Report the best fit.
cat(paste0("Best fit model with ", bestFitName,
", AUC = ", signif(fitAuc[[bestFitName]], digits = 2),
".\n"))
## Best fit model with rxFastTrees.1, AUC = 0.98.